<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <title>spark数据倾斜及解决方案 | iworkh blog</title>
  <meta name="keywords" content=" spark ">
  <!-- Exactly one description per document: the page-specific one is kept; a second
       <meta name="description"> carrying the "about" page text was removed. -->
  <meta name="description" content="spark数据倾斜及解决方案 | iworkh blog">
  <!-- maximum-scale=1 was dropped so visitors can still pinch-zoom the page. -->
  <meta name="viewport" content="width=device-width, initial-scale=1">

  <!-- NOTE(review): og:title / og:url / og:description below describe the "about" page while
       <title> is a spark article - looks like a generator/template mix-up; confirm and regenerate. -->
  <meta property="og:type" content="website">
  <meta property="og:title" content="关于">
  <meta property="og:url" content="https://iworkh.gitee.io/blog/about/index.html">
  <meta property="og:site_name" content="iworkh blog">
  <meta property="og:description" content="个人简介沐雨云楼，程序员一名。 喜欢研究技术，主要从事JAVA、微服务、大数据领域。 联系方式 QQ : 157162006 邮箱 :157162006@qq.com">
  <meta property="og:locale" content="zh_CN">
  <meta property="article:published_time" content="2020-05-17T07:22:16.000Z">
  <meta property="article:modified_time" content="2020-09-12T13:21:47.523Z">
  <meta property="article:author" content="沐雨云楼">
  <meta property="article:tag" content="iworkh">
  <meta property="article:tag" content="沐雨云楼">
  <meta property="article:tag" content="blog">
  <meta name="twitter:card" content="summary">

  <link rel="icon" href="/blog/img/iwork.svg">

  <!-- Stylesheets. CDN assets use explicit https: instead of protocol-relative URLs. -->
  <link href="/blog/css/style.css?v=1.0.1" rel="stylesheet">
  <link href="/blog/css/hl_theme/atom-light.css?v=1.0.1" rel="stylesheet">
  <link href="https://cdn.bootcdn.net/ajax/libs/animate.css/4.1.0/animate.min.css" rel="stylesheet">
  <!-- Previously pointed at js/fontawesome.min.js with rel="stylesheet", which can never apply as CSS.
       NOTE(review): assumes the CSS build was intended - confirm the theme actually uses Font Awesome. -->
  <link href="https://cdn.bootcdn.net/ajax/libs/font-awesome/5.13.0/css/all.min.css" rel="stylesheet">

  <!-- Scripts stay blocking and in their original order: jQuery is loaded before the jQuery plugins,
       and the inline hljs call needs highlight.js to be loaded first. -->
  <script src="https://cdn.bootcdn.net/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="/blog/js/jquery.autocomplete.min.js?v=1.0.1"></script>
  <script src="/blog/js/titleTip.js?v=1.0.1"></script>

  <script src="https://cdn.bootcdn.net/ajax/libs/highlight.js/10.0.3/highlight.min.js"></script>
  <script>
    hljs.initHighlightingOnLoad();
  </script>

  <script src="https://cdn.bootcdn.net/ajax/libs/nprogress/0.2.0/nprogress.min.js"></script>
  <script src="https://cdn.bootcdn.net/ajax/libs/jquery-cookie/1.4.1/jquery.cookie.min.js"></script>
  <script src="/blog/js/iconfont.js?v=1.0.1"></script>

  <meta name="generator" content="Hexo 4.2.1">
</head>
<body>
<!-- Hidden theme settings (disqus flag, comment preload flag, blog base path); presumably read by the
     theme scripts via their class names. Moved inside <body>: flow content is not allowed between
     </head> and <body>, and the parser was already relocating this div into the body. -->
<div style="display: none">
  <input class="theme_disqus_on" value="false">
  <input class="theme_preload_comment" value="false">
  <input class="theme_blog_path" value="/blog">
</div>
<aside class="nav">
    <div class="nav-left">
        <!-- Avatar linking to the blog home, followed by the author name. The image is the link's only
             content, so its alt text names the link destination. -->
        <a href="/blog/" class="avatar_target">
            <img class="avatar" src="/blog/img/iwork.svg" alt="iworkh blog">
        </a>
        <div class="author">
            <span>沐雨云楼</span>
        </div>

<!-- Social / contact icon links. Each icon is a decorative SVG sprite reference (aria-hidden); the
     link's title attribute supplies its name. Links opening a new tab carry rel="noopener", and the
     ampersands in the QQ URL are escaped as &amp; (the resolved URL is unchanged). -->
<div class="icon">
    <a title="github" href="https://gitee.com/iworkh" target="_blank" rel="noopener">
        <svg class="iconfont-svg" aria-hidden="true">
            <use xlink:href="#icon-github"></use>
        </svg>
    </a>
    <a title="csdn" href="https://blog.csdn.net/u011622109" target="_blank" rel="noopener">
        <svg class="iconfont-svg" aria-hidden="true">
            <use xlink:href="#icon-csdn"></use>
        </svg>
    </a>
    <a title="email" href="mailto:157162006@qq.com" target="_blank" rel="noopener">
        <svg class="iconfont-svg" aria-hidden="true">
            <use xlink:href="#icon-email"></use>
        </svg>
    </a>
    <a title="qq" href="http://wpa.qq.com/msgrd?v=3&amp;uin=157162006&amp;site=qq&amp;menu=yes" target="_blank" rel="noopener">
        <svg class="iconfont-svg" aria-hidden="true">
            <use xlink:href="#icon-qq"></use>
        </svg>
    </a>
    <a title="kugou" href="https://www.kugou.com/" target="_blank" rel="noopener">
        <svg class="iconfont-svg" aria-hidden="true">
            <use xlink:href="#icon-kugou"></use>
        </svg>
    </a>
    <a title="neteasemusic" href="https://music.163.com/" target="_blank" rel="noopener">
        <svg class="iconfont-svg" aria-hidden="true">
            <use xlink:href="#icon-neteasemusic"></use>
        </svg>
    </a>
</div>



<!-- "More menus" control. NOTE(review): this anchor has no href, so it is not keyboard-focusable;
     presumably theme JS toggles the menu on click - a <button type="button"> would be the accessible
     choice, but confirm the theme's CSS/JS selectors before changing the element. -->
<a class="more-menus">更多菜单</a>


<!-- Category tree. The first item is the "all posts" entry; every other item carries the category
     name in data-rel and the post count in <small>. Categories with children show a fold icon and
     hold their children in a nested <ul class="sub hide">. -->
<ul>
    <li><div class="all active">全部文章<small>(85)</small></div></li>
    <li><div data-rel="手册"><i class="fold iconfont icon-right"></i>手册<small>(2)</small></div>
        <ul class="sub hide">
            <li><div data-rel="iworkh">iworkh<small>(1)</small></div></li>
            <li><div data-rel="pgmanor">pgmanor<small>(1)</small></div></li>
        </ul>
    </li>
    <li><div data-rel="工具"><i class="fold iconfont icon-right"></i>工具<small>(16)</small></div>
        <ul class="sub hide">
            <li><div data-rel="git">git<small>(2)</small></div></li>
            <li><div data-rel="github">github<small>(1)</small></div></li>
            <li><div data-rel="hexo">hexo<small>(1)</small></div></li>
            <li><div data-rel="idea">idea<small>(2)</small></div></li>
            <li><div data-rel="markdowm">markdowm<small>(1)</small></div></li>
            <li><div data-rel="maven">maven<small>(1)</small></div></li>
            <li><div data-rel="有道云">有道云<small>(1)</small></div></li>
        </ul>
    </li>
    <li><div data-rel="java"><i class="fold iconfont icon-right"></i>java<small>(20)</small></div>
        <ul class="sub hide">
            <li><div data-rel="dao">dao<small>(6)</small></div></li>
            <li><div data-rel="jvm">jvm<small>(1)</small></div></li>
            <li><div data-rel="test">test<small>(5)</small></div></li>
            <li><div data-rel="tools">tools<small>(3)</small></div></li>
            <li><div data-rel="并发">并发<small>(1)</small></div></li>
        </ul>
    </li>
    <li><div data-rel="微服务"><i class="fold iconfont icon-right"></i>微服务<small>(6)</small></div>
        <ul class="sub hide">
            <li><div data-rel="spring">spring<small>(2)</small></div></li>
            <li><div data-rel="springboot">springboot<small>(3)</small></div></li>
        </ul>
    </li>
    <li><div data-rel="架构">架构<small>(2)</small></div></li>
    <li><div data-rel="大数据"><i class="fold iconfont icon-right"></i>大数据<small>(4)</small></div>
        <ul class="sub hide">
            <li><div data-rel="hadoop">hadoop<small>(1)</small></div></li>
            <li><div data-rel="spark">spark<small>(3)</small></div></li>
        </ul>
    </li>
    <li><div data-rel="web"><i class="fold iconfont icon-right"></i>web<small>(3)</small></div>
        <ul class="sub hide">
            <li><div data-rel="vue">vue<small>(2)</small></div></li>
        </ul>
    </li>
    <li><div data-rel="db"><i class="fold iconfont icon-right"></i>db<small>(4)</small></div>
        <ul class="sub hide">
            <li><div data-rel="mysql">mysql<small>(3)</small></div></li>
            <li><div data-rel="redis">redis<small>(1)</small></div></li>
        </ul>
    </li>
    <li><div data-rel="ai">ai<small>(4)</small></div></li>
    <li><div data-rel="python"><i class="fold iconfont icon-right"></i>python<small>(8)</small></div>
        <ul class="sub hide">
            <li><div data-rel="tornado">tornado<small>(3)</small></div></li>
        </ul>
    </li>
    <li><div data-rel="linux"><i class="fold iconfont icon-right"></i>linux<small>(2)</small></div>
        <ul class="sub hide">
            <li><div data-rel="docker">docker<small>(1)</small></div></li>
            <li><div data-rel="nginx">nginx<small>(1)</small></div></li>
        </ul>
    </li>
    <li><div data-rel="windows">windows<small>(3)</small></div></li>
    <li><div data-rel="结构和算法">结构和算法<small>(8)</small></div></li>
    <li><div data-rel="面试">面试<small>(2)</small></div></li>
    <li><div data-rel="其他">其他<small>(1)</small></div></li>
</ul>
<!-- Bottom of the left panel: external menu links (each opens a new tab, hence rel="noopener"),
     then the "about" link and the friend-links control.
     NOTE(review): the .friends anchor has no href - presumably toggled by theme JS; confirm. -->
<div class="left-bottom">
    <div class="menus">
        <a class="dynamic-menu " target="_blank" rel="noopener" href="https://pgmanor.gitee.io/blog/">pgmanor</a>
        <a class="dynamic-menu " target="_blank" rel="noopener" href="https://www.iworkh.com/manualIt/Category/scopeDev/">iworkh</a>
        <a class="dynamic-menu " target="_blank" rel="noopener" href="https://gitee.com/iworkh">gitee</a>
    </div>
    <div><a class="about  hasFriend  site_url" href="/blog/about">关于</a><a style="width: 50%" class="friends">友链</a></div>
</div>
<!-- Hidden site statistics: post count (85) and word count ("197.8k"), exposed by id;
     presumably read by theme scripts - verify against the theme JS. -->
<input type="hidden" id="iworkh_site_posts_number" value="85">
<input type="hidden" id="iworkh_site_word_count" value="197.8k">
<!-- Empty, hidden placeholders for site UV / PV values.
     NOTE(review): the ids suggest the busuanzi visitor counter fills them in at runtime - confirm. -->
<div style="display: none">
    <span id="busuanzi_value_site_uv"></span>
    <span id="busuanzi_value_site_pv"></span>
</div>

    </div>
    <div class="nav-right">
        <!-- Friend-links panel: a title bar with a back icon and a list of external sites.
             Every link opens a new tab, so each carries rel="noopener". -->
        <div class="friends-area">
            <div class="friends-title">
                友情链接
                <i class="back-title-list"></i>
            </div>
            <div class="friends-content">
                <ul>
                    <li><a target="_blank" rel="noopener" href="https://www.iworkh.com">iworkh</a></li>
                    <li><a target="_blank" rel="noopener" href="https://www.iworkh.com/download/share/">下载地址</a></li>
                    <li><a target="_blank" rel="noopener" href="http://yelog.org/">叶落阁</a></li>
                </ul>
            </div>
        </div>
        <div class="title-list">
    <!-- Search box plus the tag-list switch. The inline onkeydown handler returns false for
         keyCode 13 so that pressing Enter does not submit the form. -->
    <form onkeydown="if(event.keyCode === 13){return false;}">
        <!-- aria-label gives the field an accessible name; the placeholder alone is not a label. -->
        <input id="local-search-input" class="search" type="text" placeholder="in: 开头全文搜索" aria-label="搜索">
        <i class="cross"></i>
        <span>
            <label for="tagswitch">T:</label>
            <input id="tagswitch" type="checkbox" style="display: none">
            <i id="tagsWitchIcon"></i>
        </span>
    </form>
    <!-- Tag cloud: one item per tag, the colorN class selecting its colour, closed by a clearfix.
         NOTE(review): <li> elements are direct children of a <div>, which is invalid HTML; switching
         the wrapper to <ul> may affect theme CSS/JS, so the structure is left as generated. -->
    <div class="tags-list">
        <li class="article-tag-list-item"><a class="color3">ai</a></li>
        <li class="article-tag-list-item"><a class="color5">attr</a></li>
        <li class="article-tag-list-item"><a class="color1">aysnc</a></li>
        <li class="article-tag-list-item"><a class="color3">beanmap</a></li>
        <li class="article-tag-list-item"><a class="color1">bigdecimal</a></li>
        <li class="article-tag-list-item"><a class="color5">blog</a></li>
        <li class="article-tag-list-item"><a class="color2">chrome</a></li>
        <li class="article-tag-list-item"><a class="color5">dict</a></li>
        <li class="article-tag-list-item"><a class="color4">dll</a></li>
        <li class="article-tag-list-item"><a class="color2">docker</a></li>
        <li class="article-tag-list-item"><a class="color2">domain</a></li>
        <li class="article-tag-list-item"><a class="color4">fastjson</a></li>
        <li class="article-tag-list-item"><a class="color5">func</a></li>
        <li class="article-tag-list-item"><a class="color4">git</a></li>
        <li class="article-tag-list-item"><a class="color2">github</a></li>
        <li class="article-tag-list-item"><a class="color2">hadoop</a></li>
        <li class="article-tag-list-item"><a class="color5">hexo</a></li>
        <li class="article-tag-list-item"><a class="color5">idea</a></li>
        <li class="article-tag-list-item"><a class="color2">iworkh</a></li>
        <li class="article-tag-list-item"><a class="color3">jackson</a></li>
        <li class="article-tag-list-item"><a class="color5">java</a></li>
        <li class="article-tag-list-item"><a class="color5">jdni</a></li>
        <li class="article-tag-list-item"><a class="color2">jfinalshell</a></li>
        <li class="article-tag-list-item"><a class="color2">joplin</a></li>
        <li class="article-tag-list-item"><a class="color4">JPA</a></li>
        <li class="article-tag-list-item"><a class="color5">lock</a></li>
        <li class="article-tag-list-item"><a class="color4">markdowm</a></li>
        <li class="article-tag-list-item"><a class="color1">maven</a></li>
        <li class="article-tag-list-item"><a class="color5">mock</a></li>
        <li class="article-tag-list-item"><a class="color3">mq</a></li>
        <li class="article-tag-list-item"><a class="color1">mycat</a></li>
        <li class="article-tag-list-item"><a class="color1">mysql</a></li>
        <li class="article-tag-list-item"><a class="color1">nginx</a></li>
        <li class="article-tag-list-item"><a class="color4">notebook</a></li>
        <li class="article-tag-list-item"><a class="color5">plan</a></li>
        <li class="article-tag-list-item"><a class="color5">powermock</a></li>
        <li class="article-tag-list-item"><a class="color2">python</a></li>
        <li class="article-tag-list-item"><a class="color1">redis</a></li>
        <li class="article-tag-list-item"><a class="color5">rest</a></li>
        <li class="article-tag-list-item"><a class="color4">sharding-jdbc</a></li>
        <li class="article-tag-list-item"><a class="color1">spark</a></li>
        <li class="article-tag-list-item"><a class="color1">springboot</a></li>
        <li class="article-tag-list-item"><a class="color5">test</a></li>
        <li class="article-tag-list-item"><a class="color5">tool</a></li>
        <li class="article-tag-list-item"><a class="color1">tools</a></li>
        <li class="article-tag-list-item"><a class="color3">tornado</a></li>
        <li class="article-tag-list-item"><a class="color4">transactional</a></li>
        <li class="article-tag-list-item"><a class="color4">web</a></li>
        <li class="article-tag-list-item"><a class="color3">windows</a></li>
        <li class="article-tag-list-item"><a class="color3">wtforms</a></li>
        <li class="article-tag-list-item"><a class="color5">zookeeper</a></li>
        <li class="article-tag-list-item"><a class="color5">分库分表</a></li>
        <li class="article-tag-list-item"><a class="color3">分页</a></li>
        <li class="article-tag-list-item"><a class="color3">命令</a></li>
        <li class="article-tag-list-item"><a class="color3">工具</a></li>
        <li class="article-tag-list-item"><a class="color3">并发</a></li>
        <li class="article-tag-list-item"><a class="color4">序列化</a></li>
        <li class="article-tag-list-item"><a class="color4">微服务</a></li>
        <li class="article-tag-list-item"><a class="color5">性能分析</a></li>
        <li class="article-tag-list-item"><a class="color3">手册</a></li>
        <li class="article-tag-list-item"><a class="color4">有道云</a></li>
        <li class="article-tag-list-item"><a class="color3">标签</a></li>
        <li class="article-tag-list-item"><a class="color3">模板</a></li>
        <li class="article-tag-list-item"><a class="color5">百度云盘</a></li>
        <li class="article-tag-list-item"><a class="color1">结构和算法</a></li>
        <li class="article-tag-list-item"><a class="color3">计划</a></li>
        <li class="article-tag-list-item"><a class="color3">迁移</a></li>
        <li class="article-tag-list-item"><a class="color3">面试</a></li>
        <div class="clearfix"></div>
    </div>

    
    <!-- Empty container for search results.
         NOTE(review): presumably populated by the theme's local-search script - confirm. -->
    <div id="local-search-result">

    </div>
    
    <nav id="title-list-nav">
        
        <!-- Post links marked with id="top". Each link carries its categories in class, its tags in
             data-tag and the author in data-author; the spans hold the title and the publish date.
             NOTE(review): all nine anchors share id="top", but an id must be unique per document.
             The theme's CSS/JS presumably keys on #top, so confirm before switching to a class. -->
        <a id="top" class="手册 pgmanor " href="/blog/2020/12/15/redirect-to-pgmanor/" data-tag="迁移" data-author="沐雨云楼">
            <span class="post-title" title="😘博客迁移">😘博客迁移</span>
            <span class="post-date" title="2020-12-15 23:47:55">2020/12/15</span>
        </a>
        <a id="top" class="手册 iworkh " href="/blog/2020/05/27/manual-iworkh/" data-tag="手册,iworkh" data-author="沐雨云楼">
            <span class="post-title" title="iworkh所有手册">iworkh所有手册</span>
            <span class="post-date" title="2020-05-27 21:37:04">2020/05/27</span>
        </a>
        <a id="top" class="微服务 springboot " href="/blog/2020/06/27/spring_boot_all_resources/" data-tag="springboot" data-author="沐雨云楼">
            <span class="post-title" title="Spring Boot全网优质教程汇总">Spring Boot全网优质教程汇总</span>
            <span class="post-date" title="2020-06-27 17:43:13">2020/06/27</span>
        </a>
        <a id="top" class="工具 " href="/blog/2020/06/22/baidu-pandisk-share/" data-tag="百度云盘" data-author="沐雨云楼">
            <span class="post-title" title="百度云盘分享链接">百度云盘分享链接</span>
            <span class="post-date" title="2020-06-22 18:38:21">2020/06/22</span>
        </a>
        <a id="top" class="工具 " href="/blog/2020/06/21/tool_blogs_all/" data-tag="blog" data-author="沐雨云楼">
            <span class="post-title" title="优质博客和文章">优质博客和文章</span>
            <span class="post-date" title="2020-06-21 20:02:14">2020/06/21</span>
        </a>
        <a id="top" class="java jvm " href="/blog/2020/06/04/java-online-analysis/" data-tag="性能分析" data-author="沐雨云楼">
            <span class="post-title" title="JAVA线上故障排查全套路">JAVA线上故障排查全套路</span>
            <span class="post-date" title="2020-06-04 20:34:05">2020/06/04</span>
        </a>
        <a id="top" class="工具 idea " href="/blog/2020/05/31/jetbrains-idea/" data-tag="工具,idea" data-author="沐雨云楼">
            <span class="post-title" title="idea使用技巧">idea使用技巧</span>
            <span class="post-date" title="2020-05-31 16:12:30">2020/05/31</span>
        </a>
        <a id="top" class="工具 有道云 " href="/blog/2020/05/18/youdao_config_edit/" data-tag="工具,有道云" data-author="沐雨云楼">
            <span class="post-title" title="有道云配置修改">有道云配置修改</span>
            <span class="post-date" title="2020-05-18 21:37:04">2020/05/18</span>
        </a>
        <a id="top" class="web " href="/blog/2020/06/01/web-tools/" data-tag="web,tool" data-author="沐雨云楼">
            <span class="post-title" title="前端常用工具">前端常用工具</span>
            <span class="post-date" title="2020-06-01 21:37:04">2020/06/01</span>
        </a>
        
        <!-- Post links, newest first. Each link carries its categories in class, its tags in
             data-tag and the author in data-author; the spans hold the title and the publish date. -->
        <a class="工具 " href="/blog/2020/10/25/tool-linux-jfinalshell/" data-tag="jfinalshell" data-author="沐雨云楼">
            <span class="post-title" title="linux连接工具jfinalshell">linux连接工具jfinalshell</span>
            <span class="post-date" title="2020-10-25 10:24:59">2020/10/25</span>
        </a>
        <a class="微服务 springboot " href="/blog/2020/10/06/spring-boot-multi-profiles/" data-tag="springboot" data-author="沐雨云楼">
            <span class="post-title" title="springboot多环境配置打包">springboot多环境配置打包</span>
            <span class="post-date" title="2020-10-06 21:36:42">2020/10/06</span>
        </a>
        <a class="web vue " href="/blog/2020/10/06/vue_springboot_package_publish/" data-tag="maven" data-author="沐雨云楼">
            <span class="post-title" title="vue+springboot打包发布">vue+springboot打包发布</span>
            <span class="post-date" title="2020-10-06 14:22:29">2020/10/06</span>
        </a>
        <a class="java dao " href="/blog/2020/10/06/java-jpa-query/" data-tag="JPA" data-author="沐雨云楼">
            <span class="post-title" title="JPA自定义sql">JPA自定义sql</span>
            <span class="post-date" title="2020-10-06 10:20:38">2020/10/06</span>
        </a>
        <a class="微服务 spring " href="/blog/2020/10/03/spring-pageable/" data-tag="分页" data-author="沐雨云楼">
            <span class="post-title" title="vue+springboot分页交互">vue+springboot分页交互</span>
            <span class="post-date" title="2020-10-03 11:08:21">2020/10/03</span>
        </a>
        <a class="工具 " href="/blog/2020/10/02/chrome-plugin/" data-tag="chrome" data-author="沐雨云楼">
            <span class="post-title" title="chrome插件">chrome插件</span>
            <span class="post-date" title="2020-10-02 10:11:31">2020/10/02</span>
        </a>
        <a class="windows " href="/blog/2020/10/01/windows-less-dll/" data-tag="dll" data-author="沐雨云楼">
            <span class="post-title" title="windows缺少dll">windows缺少dll</span>
            <span class="post-date" title="2020-10-01 20:22:28">2020/10/01</span>
        </a>
        <a class="java " href="/blog/2020/09/13/java-base-bigdecimal/" data-tag="bigdecimal" data-author="沐雨云楼">
            <span class="post-title" title="BigDecimal正确使用了吗?">BigDecimal正确使用了吗?</span>
            <span class="post-date" title="2020-09-13 10:28:54">2020/09/13</span>
        </a>
        <a class="windows " href="/blog/2020/09/12/windows-dev-install/" data-tag="windows" data-author="沐雨云楼">
            <span class="post-title" title="windows开发环境安装">windows开发环境安装</span>
            <span class="post-date" title="2020-09-12 21:39:01">2020/09/12</span>
        </a>
        <a class="工具 " href="/blog/2020/09/06/tool-notebook-joplin/" data-tag="joplin" data-author="沐雨云楼">
            <span class="post-title" title="joplin笔记">joplin笔记</span>
            <span class="post-date" title="2020-09-06 20:50:55">2020/09/06</span>
        </a>
        <a class="web vue " href="/blog/2020/09/05/vue-element-admin/" data-tag="web" data-author="沐雨云楼">
            <span class="post-title" title="vue element admin开发">vue element admin开发</span>
            <span class="post-date" title="2020-09-05 11:01:38">2020/09/05</span>
        </a>
        <a class="工具 " href="/blog/2020/09/05/tool-notebook/" data-tag="notebook" data-author="沐雨云楼">
            <span class="post-title" title="常见笔记软件">常见笔记软件</span>
            <span class="post-date" title="2020-09-05 10:54:34">2020/09/05</span>
        </a>
        <a class="java dao " href="/blog/2020/08/30/java-dao-shardingSphere/" data-tag="sharding-jdbc" data-author="沐雨云楼">
            <span class="post-title" title="分库分表中间件-ShardingSphere">分库分表中间件-ShardingSphere</span>
            <span class="post-date" title="2020-08-30 21:52:45">2020/08/30</span>
        </a>
        <a class="java dao " href="/blog/2020/08/30/java-dao-mycat/" data-tag="mycat" data-author="沐雨云楼">
            <span class="post-title" title="分库分表中间件-MyCat">分库分表中间件-MyCat</span>
            <span class="post-date" title="2020-08-30 15:01:59">2020/08/30</span>
        </a>
        <a class="java dao " href="/blog/2020/08/30/java-dao-sharding-db/" data-tag="分库分表" data-author="沐雨云楼">
            <span class="post-title" title="分库分表中间件-开源产品">分库分表中间件-开源产品</span>
            <span class="post-date" title="2020-08-30 09:37:18">2020/08/30</span>
        </a>
        <a class="java dao " href="/blog/2020/08/29/java-jndi/" data-tag="jdni" data-author="沐雨云楼">
            <span class="post-title" title="JNDI">JNDI</span>
            <span class="post-date" title="2020-08-29 17:12:03">2020/08/29</span>
        </a>
        <a class="工具 git " href="/blog/2020/08/27/tool-git-setting/" data-tag="git" data-author="沐雨云楼">
            <span class="post-title" title="git配置">git配置</span>
            <span class="post-date" title="2020-08-27 21:30:17">2020/08/27</span>
        </a>
        <a class="java test " href="/blog/2020/08/23/java-powermock/" data-tag="powermock" data-author="沐雨云楼">
            <span class="post-title" title="powermock测试框架">powermock测试框架</span>
            <span class="post-date" title="2020-08-23 12:45:17">2020/08/23</span>
        </a>
        <a class="java test " href="/blog/2020/08/23/java-mockito/" data-tag="mock" data-author="沐雨云楼">
            <span class="post-title" title="👍mockito测试框架">👍mockito测试框架</span>
            <span class="post-date" title="2020-08-23 10:45:17">2020/08/23</span>
        </a>
        <a class="java test " href="/blog/2020/08/22/java-test-ng/" data-tag="test" data-author="沐雨云楼">
            <span class="post-title" title="👍TestNG一篇足以">👍TestNG一篇足以</span>
            <span class="post-date" title="2020-08-22 18:37:28">2020/08/22</span>
        </a>
        <a class="db mysql " href="/blog/2020/08/19/mysql-common-sql/" data-tag="mysql" data-author="沐雨云楼">
            <span class="post-title" title="mysql常用sql">mysql常用sql</span>
            <span class="post-date" title="2020-08-19 21:30:00">2020/08/19</span>
        </a>
        <a class="db mysql " href="/blog/2020/08/19/mysql-windows-zip-install/" data-tag="mysql" data-author="沐雨云楼">
            <span class="post-title" title="mysql zip在windows上安装">mysql zip在windows上安装</span>
            <span class="post-date" title="2020-08-19 21:00:47">2020/08/19</span>
        </a>
        <a class="工具 idea " href="/blog/2020/08/15/jetbrains-crack/" data-tag="工具,idea" data-author="沐雨云楼">
            <span class="post-title" title="jetbrains全家桶破解">jetbrains全家桶破解</span>
            <span class="post-date" title="2020-08-15 19:22:33">2020/08/15</span>
        </a>
        <a class="工具 git " href="/blog/2020/07/10/tool-git-error-name-too-long/" data-tag="git" data-author="沐雨云楼">
            <span class="post-title" title="git文件名太长">git文件名太长</span>
            <span class="post-date" title="2020-07-10 12:03:02">2020/07/10</span>
        </a>
        <a class="db mysql " href="/blog/2020/07/04/mysql-lock-unlock-table/" data-tag="lock" data-author="沐雨云楼">
            <span class="post-title" title="mysql数据库锁查询和释放">mysql数据库锁查询和释放</span>
            <span class="post-date" title="2020-07-04 21:46:54">2020/07/04</span>
        </a>
        <a class="结构和算法 " href="/blog/2020/07/04/data-structures-algorithms-sort/" data-tag="结构和算法" data-author="沐雨云楼">
            <span class="post-title" title="数据结构和算法8-排序">数据结构和算法8-排序</span>
            <span class="post-date" title="2020-07-04 18:49:12">2020/07/04</span>
        </a>
        <a class="结构和算法 " href="/blog/2020/07/04/data-structures-algorithms-search/" data-tag="结构和算法" data-author="沐雨云楼">
            <span class="post-title" title="数据结构和算法7-搜索">数据结构和算法7-搜索</span>
            <span class="post-date" title="2020-07-04 17:48:19">2020/07/04</span>
        </a>
        <a class="结构和算法 " href="/blog/2020/07/04/data-structures-algorithms-graph/" data-tag="结构和算法" data-author="沐雨云楼">
            <span class="post-title" title="数据结构和算法6-非线性-图">数据结构和算法6-非线性-图</span>
            <span class="post-date" title="2020-07-04 16:09:47">2020/07/04</span>
        </a>
        <a class="结构和算法 " href="/blog/2020/07/04/data-structures-algorithms-tree/" data-tag="结构和算法" data-author="沐雨云楼">
            <span class="post-title" title="数据结构和算法5-非线性-树">数据结构和算法5-非线性-树</span>
            <span class="post-date" title="2020-07-04 15:09:47">2020/07/04</span>
        </a>
        <a class="结构和算法 " href="/blog/2020/07/04/data-structures-algorithms-linear-queue/" data-tag="结构和算法" data-author="沐雨云楼">
            <span class="post-title" title="数据结构和算法4-线性-队列">数据结构和算法4-线性-队列</span>
            <span class="post-date" title="2020-07-04 14:10:22">2020/07/04</span>
        </a>
        
        <a  class="结构和算法 "
           href="/blog/2020/07/04/data-structures-algorithms-linear-stack/"
           data-tag="结构和算法"
           data-author="沐雨云楼" >
            <span class="post-title" title="数据结构和算法3-线性-栈">数据结构和算法3-线性-栈</span>
            <span class="post-date" title="2020-07-04 13:00:17">2020/07/04</span>
        </a>
        
        <a  class="结构和算法 "
           href="/blog/2020/07/04/data-structures-algorithms-linear-list/"
           data-tag="结构和算法"
           data-author="沐雨云楼" >
            <span class="post-title" title="数据结构和算法2-线性-链">数据结构和算法2-线性-链</span>
            <span class="post-date" title="2020-07-04 11:32:12">2020/07/04</span>
        </a>
        
        <a  class="结构和算法 "
           href="/blog/2020/07/04/data-structures-algorithms-info/"
           data-tag="结构和算法"
           data-author="沐雨云楼" >
            <span class="post-title" title="数据结构和算法1-介绍">数据结构和算法1-介绍</span>
            <span class="post-date" title="2020-07-04 10:10:40">2020/07/04</span>
        </a>
        
        <a  class="大数据 spark "
           href="/blog/2020/06/30/spark-install-windows/"
           data-tag="spark"
           data-author="沐雨云楼" >
            <span class="post-title" title="spark开发环境搭建">spark开发环境搭建</span>
            <span class="post-date" title="2020-06-30 19:24:51">2020/06/30</span>
        </a>
        
        <a  class="大数据 hadoop "
           href="/blog/2020/06/30/hadoop-install-windows/"
           data-tag="hadoop"
           data-author="沐雨云楼" >
            <span class="post-title" title="hadoop开发环境搭建">hadoop开发环境搭建</span>
            <span class="post-date" title="2020-06-30 18:06:23">2020/06/30</span>
        </a>
        
        <a  class="java "
           href="/blog/2020/06/30/java-install-windows/"
           data-tag="java"
           data-author="沐雨云楼" >
            <span class="post-title" title="jdk开发环境搭建">jdk开发环境搭建</span>
            <span class="post-date" title="2020-06-30 15:09:34">2020/06/30</span>
        </a>
        
        <a  class="ai "
           href="/blog/2020/06/30/ai-python-spark-env/"
           data-tag="spark"
           data-author="沐雨云楼" >
            <span class="post-title" title="python spark开发环境搭建">python spark开发环境搭建</span>
            <span class="post-date" title="2020-06-30 12:14:04">2020/06/30</span>
        </a>
        
        <a  class="ai "
           href="/blog/2020/06/29/ai-jupyter-notebook/"
           data-tag="标签"
           data-author="沐雨云楼" >
            <span class="post-title" title="jupyter notebook">jupyter notebook</span>
            <span class="post-date" title="2020-06-29 12:29:36">2020/06/29</span>
        </a>
        
        <a  class="ai "
           href="/blog/2020/06/26/ai-python-tools/"
           data-tag="tools"
           data-author="沐雨云楼" >
            <span class="post-title" title="python和ai常用工具">python和ai常用工具</span>
            <span class="post-date" title="2020-06-26 09:55:24">2020/06/26</span>
        </a>
        
        <a  class="python "
           href="/blog/2020/06/25/python-tool-buildin-attr/"
           data-tag="attr"
           data-author="沐雨云楼" >
            <span class="post-title" title="👍python内置属性(魔法方法)">👍python内置属性(魔法方法)</span>
            <span class="post-date" title="2020-06-25 11:01:33">2020/06/25</span>
        </a>
        
        <a  class="python "
           href="/blog/2020/06/24/python-tool-buildin-func/"
           data-tag="func"
           data-author="沐雨云楼" >
            <span class="post-title" title="👍python内置函数">👍python内置函数</span>
            <span class="post-date" title="2020-06-24 18:31:59">2020/06/24</span>
        </a>
        
        <a  class="python "
           href="/blog/2020/06/24/python-tool-obj-dict/"
           data-tag="dict"
           data-author="沐雨云楼" >
            <span class="post-title" title="python对象与dict互转">python对象与dict互转</span>
            <span class="post-date" title="2020-06-24 18:15:03">2020/06/24</span>
        </a>
        
        <a  class="大数据 spark "
           href="/blog/2020/06/21/spark-version-3/"
           data-tag="spark"
           data-author="沐雨云楼" >
            <span class="post-title" title="spark3.0新特性">spark3.0新特性</span>
            <span class="post-date" title="2020-06-21 17:40:12">2020/06/21</span>
        </a>
        
        <a  class="java "
           href="/blog/2020/06/21/java-base-domain/"
           data-tag="domain"
           data-author="沐雨云楼" >
            <span class="post-title" title="VO DTO DO PO你都了解吗?">VO DTO DO PO你都了解吗?</span>
            <span class="post-date" title="2020-06-21 09:20:03">2020/06/21</span>
        </a>
        
        <a  class="大数据 spark "
           href="/blog/2020/06/20/spark-data-skew/"
           data-tag="spark"
           data-author="沐雨云楼" >
            <span class="post-title" title="spark数据倾斜及解决方案">spark数据倾斜及解决方案</span>
            <span class="post-date" title="2020-06-20 21:41:12">2020/06/20</span>
        </a>
        
        <a  class="linux nginx "
           href="/blog/2020/06/20/nginx-fast/"
           data-tag="nginx"
           data-author="沐雨云楼" >
            <span class="post-title" title="nginx为何这么快?">nginx为何这么快?</span>
            <span class="post-date" title="2020-06-20 17:55:10">2020/06/20</span>
        </a>
        
        <a  class="架构 "
           href="/blog/2020/06/20/fw-mq-message-missing/"
           data-tag="mq"
           data-author="沐雨云楼" >
            <span class="post-title" title="MQ如何保证消息不丢失?">MQ如何保证消息不丢失?</span>
            <span class="post-date" title="2020-06-20 17:03:18">2020/06/20</span>
        </a>
        
        <a  class="java "
           href="/blog/2020/06/20/java-base-serializable/"
           data-tag="序列化"
           data-author="沐雨云楼" >
            <span class="post-title" title="java序列化">java序列化</span>
            <span class="post-date" title="2020-06-20 16:29:04">2020/06/20</span>
        </a>
        
        <a  class="架构 "
           href="/blog/2020/06/20/fw-distributed-lock-redis-zk/"
           data-tag="lock,redis,zookeeper"
           data-author="沐雨云楼" >
            <span class="post-title" title="分布式锁用redis还是zk?">分布式锁用redis还是zk?</span>
            <span class="post-date" title="2020-06-20 12:09:55">2020/06/20</span>
        </a>
        
        <a  class="java tools "
           href="/blog/2020/06/19/java-tool-jackson/"
           data-tag="jackson"
           data-author="沐雨云楼" >
            <span class="post-title" title="工具类--jackson">工具类--jackson</span>
            <span class="post-date" title="2020-06-19 13:00:45">2020/06/19</span>
        </a>
        
        <a  class="java tools "
           href="/blog/2020/06/19/java-tool-bean-map/"
           data-tag="beanmap"
           data-author="沐雨云楼" >
            <span class="post-title" title="工具类--bean和map互转">工具类--bean和map互转</span>
            <span class="post-date" title="2020-06-19 12:30:14">2020/06/19</span>
        </a>
        
        <a  class="java test "
           href="/blog/2020/06/18/java-rest-assured-in-action/"
           data-tag="rest"
           data-author="沐雨云楼" >
            <span class="post-title" title="rest-assured实战">rest-assured实战</span>
            <span class="post-date" title="2020-06-18 17:51:56">2020/06/18</span>
        </a>
        
        <a  class="java test "
           href="/blog/2020/06/17/java_rest_assured_wiki_info/"
           data-tag="rest"
           data-author="沐雨云楼" >
            <span class="post-title" title="rest-assured wiki翻译">rest-assured wiki翻译</span>
            <span class="post-date" title="2020-06-17 20:39:11">2020/06/17</span>
        </a>
        
        <a  class="python tornado "
           href="/blog/2020/06/16/tornado-wtforms/"
           data-tag="wtforms"
           data-author="沐雨云楼" >
            <span class="post-title" title="wtforms表单验证">wtforms表单验证</span>
            <span class="post-date" title="2020-06-16 21:52:33">2020/06/16</span>
        </a>
        
        <a  class="python tornado "
           href="/blog/2020/06/16/tornado-sync-to-async/"
           data-tag="aysnc"
           data-author="沐雨云楼" >
            <span class="post-title" title="tornado同步转异步">tornado同步转异步</span>
            <span class="post-date" title="2020-06-16 21:50:25">2020/06/16</span>
        </a>
        
        <a  class="python tornado "
           href="/blog/2020/06/08/python_tornado_info/"
           data-tag="tornado"
           data-author="沐雨云楼" >
            <span class="post-title" title="👍Tornado入门这一篇足以">👍Tornado入门这一篇足以</span>
            <span class="post-date" title="2020-06-08 20:31:32">2020/06/08</span>
        </a>
        
        <a  class="工具 maven "
           href="/blog/2020/06/07/tool-maven/"
           data-tag="tools,工具"
           data-author="沐雨云楼" >
            <span class="post-title" title="maven常用命令和配置">maven常用命令和配置</span>
            <span class="post-date" title="2020-06-07 16:30:11">2020/06/07</span>
        </a>
        
        <a  class="微服务 springboot "
           href="/blog/2020/06/07/spring-initialzr/"
           data-tag="tools,工具"
           data-author="沐雨云楼" >
            <span class="post-title" title="spring initialzr环境编译发布">spring initialzr环境编译发布</span>
            <span class="post-date" title="2020-06-07 15:06:00">2020/06/07</span>
        </a>
        
        <a  class="windows "
           href="/blog/2020/06/07/windows-cmd/"
           data-tag="tools,工具,命令"
           data-author="沐雨云楼" >
            <span class="post-title" title="windows常用命令和技巧">windows常用命令和技巧</span>
            <span class="post-date" title="2020-06-07 11:39:01">2020/06/07</span>
        </a>
        
        <a  class="工具 github "
           href="/blog/2020/06/07/tool-connect-github/"
           data-tag="github"
           data-author="沐雨云楼" >
            <span class="post-title" title="github无法访问解决办法">github无法访问解决办法</span>
            <span class="post-date" title="2020-06-07 10:08:38">2020/06/07</span>
        </a>
        
        <a  class="微服务 spring "
           href="/blog/2020/06/03/spring-transaction/"
           data-tag="transactional"
           data-author="沐雨云楼" >
            <span class="post-title" title="Transactional注解">Transactional注解</span>
            <span class="post-date" title="2020-06-03 22:09:42">2020/06/03</span>
        </a>
        
        <a  class="java dao "
           href="/blog/2020/06/03/java-jpa-query-multi-param/"
           data-tag="JPA"
           data-author="沐雨云楼" >
            <span class="post-title" title="JPA多条件查询">JPA多条件查询</span>
            <span class="post-date" title="2020-06-03 21:38:04">2020/06/03</span>
        </a>
        
        <a  class="java tools "
           href="/blog/2020/06/02/java-tool-fastjson-bug/"
           data-tag="fastjson"
           data-author="沐雨云楼" >
            <span class="post-title" title="Fastjson远程代码高危漏洞">Fastjson远程代码高危漏洞</span>
            <span class="post-date" title="2020-06-02 22:38:28">2020/06/02</span>
        </a>
        
        <a  class="linux docker "
           href="/blog/2020/06/02/docker-introduction/"
           data-tag="docker"
           data-author="沐雨云楼" >
            <span class="post-title" title="👍docker入门看这一篇就够了">👍docker入门看这一篇就够了</span>
            <span class="post-date" title="2020-06-02 22:22:30">2020/06/02</span>
        </a>
        
        <a  class="面试 "
           href="/blog/2020/06/02/interview-java-question/"
           data-tag="面试"
           data-author="沐雨云楼" >
            <span class="post-title" title="java常见面试题">java常见面试题</span>
            <span class="post-date" title="2020-06-02 22:15:48">2020/06/02</span>
        </a>
        
        <a  class="微服务 "
           href="/blog/2020/06/02/microservice-why/"
           data-tag="微服务"
           data-author="沐雨云楼" >
            <span class="post-title" title="牛逼公司一定要使用微服务？">牛逼公司一定要使用微服务？</span>
            <span class="post-date" title="2020-06-02 21:15:29">2020/06/02</span>
        </a>
        
        <a  class="面试 "
           href="/blog/2020/06/02/interview-company/"
           data-tag="面试"
           data-author="沐雨云楼" >
            <span class="post-title" title="IT互联网公司名单">IT互联网公司名单</span>
            <span class="post-date" title="2020-06-02 20:41:23">2020/06/02</span>
        </a>
        
        <a  class="java 并发 "
           href="/blog/2020/06/02/java-concurrent-aqs/"
           data-tag="java,并发"
           data-author="沐雨云楼" >
            <span class="post-title" title="java并发AQS">java并发AQS</span>
            <span class="post-date" title="2020-06-02 20:01:48">2020/06/02</span>
        </a>
        
        <a  class="ai "
           href="/blog/2020/05/30/ai-learn-catalog/"
           data-tag="ai"
           data-author="沐雨云楼" >
            <span class="post-title" title="AI学习目录">AI学习目录</span>
            <span class="post-date" title="2020-05-30 08:49:39">2020/05/30</span>
        </a>
        
        <a  class="工具 hexo "
           href="/blog/2020/05/29/hexo-theme-math/"
           data-tag="工具,hexo"
           data-author="沐雨云楼" >
            <span class="post-title" title="hexo主题支持MathJax">hexo主题支持MathJax</span>
            <span class="post-date" title="2020-05-29 09:36:46">2020/05/29</span>
        </a>
        
        <a  class="db redis "
           href="/blog/2020/05/28/redis-memory-eliminate/"
           data-tag="redis"
           data-author="沐雨云楼" >
            <span class="post-title" title="Redis的内存淘汰策略">Redis的内存淘汰策略</span>
            <span class="post-date" title="2020-05-28 21:37:04">2020/05/28</span>
        </a>
        
        <a  class="工具 markdowm "
           href="/blog/2020/05/28/markdown-math-express/"
           data-tag="工具,markdowm"
           data-author="沐雨云楼" >
            <span class="post-title" title="markdown数学公式">markdown数学公式</span>
            <span class="post-date" title="2020-05-28 08:02:03">2020/05/28</span>
        </a>
        
        <a  class="python "
           href="/blog/2020/05/21/python_install/"
           data-tag="python"
           data-author="沐雨云楼" >
            <span class="post-title" title="python环境安装">python环境安装</span>
            <span class="post-date" title="2020-05-21 21:37:04">2020/05/21</span>
        </a>
        
        <a  class="python "
           href="/blog/2020/05/21/python_base/"
           data-tag="python"
           data-author="沐雨云楼" >
            <span class="post-title" title="python基本语法">python基本语法</span>
            <span class="post-date" title="2020-05-21 00:00:00">2020/05/21</span>
        </a>
        
        <a  class="其他 "
           href="/blog/2020/05/20/todo-list/"
           data-tag="plan,计划"
           data-author="沐雨云楼" >
            <span class="post-title" title="计划">计划</span>
            <span class="post-date" title="2020-05-20 11:04:19">2020/05/20</span>
        </a>
        
        <a  class="工具 "
           href="/blog/2020/05/20/blog-template/"
           data-tag="模板"
           data-author="沐雨云楼" >
            <span class="post-title" title="博客自定义模板">博客自定义模板</span>
            <span class="post-date" title="2020-05-20 05:20:00">2020/05/20</span>
        </a>
        
    </nav>
</div>

    </div>
    <div class="hide-list">
        <div class="semicircle" data-title="切换全屏 快捷键 s">
            <div class="brackets first">&lt;</div>
            <div class="brackets">&gt;</div>
        </div>
    </div>
</aside>
<div class="post">
    <div class="pjax">
        <article id="post-spark-data-skew" class="article article-type-post" itemscope itemprop="blogPost">
    
        <h1 class="article-title">spark数据倾斜及解决方案</h1>
    
    <div class="article-meta">
        
        
        <span class="author"><a>沐雨云楼</a></span>
        
        
        <span class="book">
            
                <a  data-rel="大数据">大数据</a>/
            
                <a  data-rel="spark">spark</a>
            
        </span>
        
        
        <span class="tag">
            
            <a class="color1">spark</a>
            
        </span>
        
    </div>
    <div class="article-meta">
        
            发布时间 : <time class="date" title='最后更新: 2020-09-12 21:21:47'>2020-06-20 21:41</time>
        
    </div>
    <div class="article-meta">
        
        <span>字数:4.8k</span>
        
        
        <span id="busuanzi_container_page_pv">
            阅读 :<span id="busuanzi_value_page_pv">
                <span class="count-comment">
                    <span class="spinner">
                      <div class="cube1"></div>
                      <div class="cube2"></div>
                    </span>
                </span>
            </span>
        </span>
        
        
        <span class="top-comment" title="跳转至评论区">
            <a href="#comments">
                评论:<span class="count-comment">
                    <span class="spinner">
                      <div class="cube1"></div>
                      <div class="cube2"></div>
                    </span>
                </span>
            </a>
        </span>
        
    </div>
    
    <div class="article-meta">
        <span class="origin">来源:
            <a href="https://yq.aliyun.com/articles/741111" target="_blank">
                
                Spark数据倾斜及其解决方案
                
            </a>
        </span>
    </div>
    
    <div class="toc-ref">
    
        <ol class="toc"><li class="toc-item toc-level-2"><a class="toc-link" href="#1-什么是数据倾斜"><span class="toc-text">1.什么是数据倾斜</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#2-数据倾斜的危害"><span class="toc-text">2.数据倾斜的危害</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#3-数据倾斜的现象"><span class="toc-text">3.数据倾斜的现象</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#4-数据倾斜的原因"><span class="toc-text">4.数据倾斜的原因</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#5-问题发现与定位"><span class="toc-text">5.问题发现与定位</span></a><ol class="toc-child"><li class="toc-item toc-level-3"><a class="toc-link" href="#5-1-Spark-Web-UI"><span class="toc-text">5-1.Spark Web UI</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#5-2-通过key统计"><span class="toc-text">5-2.通过key统计</span></a></li></ol></li><li class="toc-item toc-level-2"><a class="toc-link" href="#6-如何缓解数据倾斜"><span class="toc-text">6.如何缓解数据倾斜</span></a><ol class="toc-child"><li class="toc-item toc-level-3"><a class="toc-link" href="#6-1-基本思路"><span class="toc-text">6-1.基本思路</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#6-2-思路1-过滤异常数据"><span class="toc-text">6-2.思路1-过滤异常数据</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#6-3-思路2-提高shuffle并行度"><span class="toc-text">6-3.思路2-提高shuffle并行度</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#6-4-思路3-自定义Partitioner"><span class="toc-text">6-4.思路3-自定义Partitioner</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#6-5-思路4-Reduce端Join转化为Map端Join"><span class="toc-text">6-5.思路4-Reduce端Join转化为Map端Join</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#6-6-思路5-拆分join再union"><span class="toc-text">6-6.思路5-拆分join再union</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#6-7-思路6-大表key加盐小表扩大N倍jion"><span 
class="toc-text">6-7.思路6-大表key加盐小表扩大N倍join</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#6-8-思路7-map端先局部聚合"><span class="toc-text">6-8.思路7-map端先局部聚合</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#6-9-思路8-加盐局部聚合-去盐全局聚合"><span class="toc-text">6-9.思路8-加盐局部聚合+去盐全局聚合</span></a></li></ol></li><li class="toc-item toc-level-2"><a class="toc-link" href="#7-Hadoop中的数据倾斜"><span class="toc-text">7.Hadoop中的数据倾斜</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#8-参考文章"><span class="toc-text">8.参考文章</span></a></li></ol>
    
<style>
    .left-col .switch-btn,
    .left-col .switch-area {
        display: none;
    }
    .toc-level-4 i,
    .toc-level-4 ol {
        display: none !important;
    }
</style>
</div>
    
    <div class="article-entry" itemprop="articleBody">
      
        <h2 id="1-什么是数据倾斜"><a href="#1-什么是数据倾斜" class="headerlink" title="1.什么是数据倾斜"></a>1.什么是数据倾斜</h2><p>对 Spark/Hadoop 这样的分布式大数据系统来讲，数据量大并不可怕，可怕的是数据倾斜。</p>
<p>对于分布式系统而言，理想情况下，随着系统规模（节点数量）的增加，应用整体耗时线性下降。如果一台机器处理一批大量数据需要120分钟，当机器数量增加到3台时，理想的耗时为120 / 3 = 40分钟。但是，想做到分布式情况下每台机器执行时间是单机时的1 / N，就必须保证每台机器的任务量相等。不幸的是，很多时候，任务的分配是不均匀的，甚至不均匀到大部分任务被分配到个别机器上，其它大部分机器所分配的任务量只占总量的小部分。比如一台机器负责处理 80% 的任务，另外两台机器各处理 10% 的任务。</p>
<p>『不患多而患不均』，这是分布式环境下最大的问题。意味着计算能力不是线性扩展的，而是存在短板效应: <strong>一个Stage所耗费的时间，是由最慢的那个Task决定</strong>。</p>
<p>由于同一个Stage内的所有task执行相同的计算，在排除不同计算节点计算能力差异的前提下，不同task之间耗时的差异主要由该task所处理的数据量决定。所以，要想发挥分布式系统并行计算的优势，就必须解决数据倾斜问题。</p>
<h2 id="2-数据倾斜的危害"><a href="#2-数据倾斜的危害" class="headerlink" title="2.数据倾斜的危害"></a>2.数据倾斜的危害</h2><p>当出现数据倾斜时，<strong>少量任务耗时远高于其它任务</strong>，从而使得整体耗时过大，未能充分发挥分布式系统的并行计算优势。　　</p>
<p>另外，当发生数据倾斜时，部分任务处理的数据量过大，可能造成<strong>内存不足使得任务失败</strong>，并进而引起整个应用失败。　　</p>
<h2 id="3-数据倾斜的现象"><a href="#3-数据倾斜的现象" class="headerlink" title="3.数据倾斜的现象"></a>3.数据倾斜的现象</h2><p>当发现如下现象时，十有八九是发生数据倾斜了:</p>
<ul>
<li>绝大多数 task 执行得都非常快，但个别 task 执行极慢，整体任务卡在某个阶段不能结束。</li>
<li>原本能够正常执行的 Spark 作业，某天突然报出 OOM（内存溢出）异常，观察异常栈，是我们写的业务代码造成的。这种情况比较少见。</li>
</ul>
<div class="admonition note">
<p class="fa admonition-title">提示</p>
<p>
    在 Spark streaming 程序中，数据倾斜更容易出现，特别是在程序中包含一些类似 sql 的 join、group 这种操作的时候。
    因为 Spark Streaming 程序在运行的时候，我们一般不会分配特别多的内存，因此一旦在这个过程中出现一些数据倾斜，就十分容易造成 OOM。
</p>
</div>


<h2 id="4-数据倾斜的原因"><a href="#4-数据倾斜的原因" class="headerlink" title="4.数据倾斜的原因"></a>4.数据倾斜的原因</h2><p>在进行 shuffle 的时候，必须将各个节点上相同的 key 拉取到某个节点上的一个 task 来进行处理，比如按照 key 进行聚合或 join 等操作。此时如果某个 key 对应的数据量特别大的话，就会发生数据倾斜。</p>
<blockquote>
<p>比如大部分 key 对应10条数据，但是个别 key 却对应了100万条数据，那么大部分 task 可能就只会分配到10条数据，然后1秒钟就运行完了；但是个别 task 可能分配到了100万数据，要运行一两个小时。</p>
</blockquote>
<p>因此出现数据倾斜的时候，Spark 作业看起来会运行得非常缓慢，甚至可能因为某个 task 处理的数据量过大导致内存溢出。</p>
<h2 id="5-问题发现与定位"><a href="#5-问题发现与定位" class="headerlink" title="5.问题发现与定位"></a>5.问题发现与定位</h2><h3 id="5-1-Spark-Web-UI"><a href="#5-1-Spark-Web-UI" class="headerlink" title="5-1.Spark Web UI"></a>5-1.Spark Web UI</h3><p>通过 Spark Web UI 来查看当前运行的 stage 各个 task 分配的数据量（Shuffle Read Size/Records），从而进一步确定是不是 task 分配的数据不均匀导致了数据倾斜。</p>
<p>知道数据倾斜发生在哪一个 stage 之后，接着我们就需要根据 stage 划分原理，推算出来发生倾斜的那个 stage 对应代码中的哪一部分，这部分代码中肯定会有一个 shuffle 类算子。可以通过 countByKey 查看各个 key 的分布。</p>
<div class="admonition note">
<p class="fa admonition-title">提示</p>
<p>
    数据倾斜只会发生在 shuffle 过程中。这里给大家罗列一些常用的并且可能会触发 shuffle 操作的算子: 
    distinct、groupByKey、reduceByKey、aggregateByKey、join、cogroup、repartition 等。
    出现数据倾斜时，可能就是你的代码中使用了这些算子中的某一个所导致的。
</p>
</div>


<h3 id="5-2-通过key统计"><a href="#5-2-通过key统计" class="headerlink" title="5-2.通过key统计"></a>5-2.通过key统计</h3><p>也可以通过抽样统计key的出现次数验证。</p>
<p>由于数据量巨大，可以采用抽样的方式，对数据进行抽样，统计出现的次数，根据出现次数大小排序取出前几个:</p>
<pre><code class="scala">df.select(&quot;key&quot;).sample(false, 0.1)           // 数据采样
    .map(k =&gt; (k, 1)).reduceByKey(_ + _)      // 统计 key 出现的次数
    .map(k =&gt; (k._2, k._1)).sortByKey(false)  // 根据 key 出现次数进行排序
    .take(10)                                 // 取前 10 个。</code></pre>
<p>如果发现多数数据分布都较为平均，而个别数据比其他数据大上若干个数量级，则说明发生了数据倾斜。</p>
<h2 id="6-如何缓解数据倾斜"><a href="#6-如何缓解数据倾斜" class="headerlink" title="6.如何缓解数据倾斜"></a>6.如何缓解数据倾斜</h2><h3 id="6-1-基本思路"><a href="#6-1-基本思路" class="headerlink" title="6-1.基本思路"></a>6-1.基本思路</h3><p>主要从三个方面来考虑：</p>
<ul>
<li><strong>业务逻辑</strong>: <blockquote>
<p>我们从业务逻辑的层面上来优化数据倾斜，比如要统计不同城市的订单情况，那么我们单独对这些一线城市来做count，最后和其它城市做整合。</p>
</blockquote>
</li>
<li><strong>程序实现</strong>: <blockquote>
<p>比如说在 Hive 中，经常遇到 count（distinct）操作，这样会导致最终只有一个 reduce，我们可以先 group 再在外面包一层 count，就可以了；在 Spark 中使用 reduceByKey 替代 groupByKey 等。</p>
</blockquote>
</li>
<li><strong>参数调优</strong>: <blockquote>
<p>Hadoop 和 Spark 都自带了很多的参数和机制来调节数据倾斜，合理利用它们就能解决大部分问题。</p>
</blockquote>
</li>
</ul>
<h3 id="6-2-思路1-过滤异常数据"><a href="#6-2-思路1-过滤异常数据" class="headerlink" title="6-2.思路1-过滤异常数据"></a>6-2.思路1-过滤异常数据</h3><p>如果导致数据倾斜的 key 是异常数据，那么简单的过滤掉就可以了。</p>
<p>首先要对 key 进行分析，判断是哪些 key 造成数据倾斜。具体方法上面已经介绍过了，这里不赘述。</p>
<p>然后对这些 key 对应的记录进行分析:</p>
<ol>
<li>空值或者异常值之类的，大多是这个原因引起</li>
<li>无效数据，大量重复的测试数据或是对结果影响不大的有效数据</li>
<li>有效数据，业务导致的正常数据分布</li>
</ol>
<p><strong>解决方案</strong></p>
<p>对于第 1，2 种情况，直接对数据进行过滤即可。</p>
<p>第3种情况则需要特殊的处理，具体我们下面详细介绍。</p>
<h3 id="6-3-思路2-提高shuffle并行度"><a href="#6-3-思路2-提高shuffle并行度" class="headerlink" title="6-3.思路2-提高shuffle并行度"></a>6-3.思路2-提高shuffle并行度</h3><p>Spark 在做 Shuffle 时，默认使用 HashPartitioner（非 Hash Shuffle）对数据进行分区。如果并行度设置的不合适，可能造成大量不相同的 Key 对应的数据被分配到了同一个 Task 上，造成该 Task 所处理的数据远大于其它 Task，从而造成数据倾斜。</p>
<p>如果调整 Shuffle 时的并行度，使得原本被分配到同一 Task 的不同 Key 分配到不同 Task 上处理，则可降低原 Task 所需处理的数据量，从而缓解数据倾斜问题造成的短板效应。</p>
<p><strong>a.操作流程</strong></p>
<p>RDD操作可在需要Shuffle的操作算子上直接设置并行度或者使用spark.default.parallelism设置。<br>如果是Spark SQL还可通过<code>SET spark.sql.shuffle.partitions=[num_tasks]</code>设置并行度。默认参数由不同的Cluster Manager控制。</p>
<p>dataFrame和sparkSql可以设置<code>spark.sql.shuffle.partitions=[num_tasks]</code>参数控制 shuffle 的并发度，默认为200。</p>
<p><strong>b.适用场景</strong></p>
<p>大量不同的Key被分配到了相同的Task造成该Task数据量过大。</p>
<p><strong>c.解决方案</strong></p>
<p>调整并行度。一般是增大并行度，但有时减小并行度也可达到效果。</p>
<p><strong>d.优势</strong></p>
<p>实现简单，只需要参数调优。可用最小的代价解决问题。一般如果出现数据倾斜，都可以通过这种方法先试验几次，如果问题未解决，再尝试其它方法。</p>
<p><strong>e.劣势</strong></p>
<p>适用场景少，只是让每个 task 执行更少的不同的key。无法解决个别key特别大的情况造成的倾斜，如果某些key的大小非常大，即使一个 task 单独执行它，也会受到数据倾斜的困扰。<br>并且该方法一般只能缓解数据倾斜，没有彻底消除问题。从实践经验来看，其效果一般。</p>
<blockquote>
<p>可以把数据倾斜类比为hash冲突。提高并行度就类似于提高hash表的大小。</p>
</blockquote>
<h3 id="6-4-思路3-自定义Partitioner"><a href="#6-4-思路3-自定义Partitioner" class="headerlink" title="6-4.思路3-自定义Partitioner"></a>6-4.思路3-自定义Partitioner</h3><p><strong>原理</strong></p>
<p>使用自定义的 Partitioner（默认为 HashPartitioner），将原本被分配到同一个Task的不同 Key分配到不同Task。</p>
<p>例如，我们在<code>groupByKey</code>算子上，使用自定义的 Partitioner:</p>
<pre><code class="java">.groupByKey(new Partitioner() {
  @Override
  public int numPartitions() {
    return 12;
  }

  @Override
  public int getPartition(Object key) {
    int id = Integer.parseInt(key.toString());
    if(id &gt;= 9500000 &amp;&amp; id &lt;= 9500084 &amp;&amp; ((id - 9500000) % 12) == 0) {
      return (id - 9500000) / 12;
    } else {
      return id % 12;
    }
  }
})</code></pre>
<blockquote>
<p>这个做法相当于自定义hash表的哈希函数。</p>
</blockquote>
<p><strong>b.适用场景</strong></p>
<p>大量不同的 Key 被分配到了相同的 Task 造成该 Task 数据量过大。</p>
<p><strong>c.解决方案</strong></p>
<p>使用自定义的 Partitioner 实现类代替默认的 HashPartitioner，尽量将所有不同的 Key 均匀分配到不同的 Task 中。</p>
<p><strong>d.优势</strong></p>
<p>不影响原有的并行度设计。如果改变并行度，后续Stage的并行度也会默认改变，可能会影响后续Stage。</p>
<p><strong>e.劣势</strong></p>
<p>适用场景有限，只能将不同Key分散开，对于同一Key对应数据集非常大的场景不适用。<br>效果与调整并行度类似，只能缓解数据倾斜而不能完全消除数据倾斜。而且需要根据数据特点自定义专用的 Partitioner，不够灵活。</p>
<h3 id="6-5-思路4-Reduce端Join转化为Map端Join"><a href="#6-5-思路4-Reduce端Join转化为Map端Join" class="headerlink" title="6-5.思路4-Reduce端Join转化为Map端Join"></a>6-5.思路4-Reduce端Join转化为Map端Join</h3><p>通过Spark的<strong>Broadcast</strong>机制，将Reduce端Join转化为Map端Join，这意味着Spark现在不需要跨节点做shuffle而是直接通过本地文件进行join，从而完全消除Shuffle带来的数据倾斜。</p>
<p><img src="/blog/images/bigdata/spark/spark_data_skew_join.png" alt="img"></p>
<pre><code class="python">from pyspark.sql.functions import broadcast
result = broadcast(A).join(B, [&quot;join_col&quot;], &quot;left&quot;)</code></pre>
<p>其中 A 是比较小的 dataframe 并且能够整个存放在 executor 内存中。</p>
<p><strong>a.适用场景</strong></p>
<p>参与Join的一边数据集足够小，可被加载进 Driver 并通过 Broadcast 方法广播到各个 Executor 中。</p>
<p><strong>b.解决方案</strong></p>
<p>在 Java/Scala 代码中将小数据集数据拉取到 Driver，然后通过 Broadcast 方案将小数据集的数据广播到各 Executor。<br>或者在使用 SQL 前，将 Broadcast 的阈值调整得足够大，从而使 Broadcast 生效。进而将Reduce Join替换为Map Join。</p>
<p><strong>c.优势</strong></p>
<p>避免了 Shuffle，彻底消除了数据倾斜产生的条件，可极大提升性能。</p>
<p><strong>d.劣势</strong></p>
<p>因为是先将小数据通过 Broadcast 发送到每个 executor 上，所以需要参与Join的一方数据集足够小，并且主要适用于Join的场景，不适合聚合的场景，适用条件有限。</p>
<p>使用Spark SQL时需要通过<code>SET spark.sql.autoBroadcastJoinThreshold=104857600</code>将Broadcast的阈值设置得足够大，才会生效。</p>
<h3 id="6-6-思路5-拆分join再union"><a href="#6-6-思路5-拆分join再union" class="headerlink" title="6-6.思路5-拆分join再union"></a>6-6.思路5-拆分join再union</h3><p>思路很简单，就是将一个join拆分成 <strong>倾斜数据集Join</strong> 和 <strong>非倾斜数据集Join</strong>，最后进行union:</p>
<ol>
<li>对包含少数几个数据量过大的 key 的那个 RDD (假设是 leftRDD)，通过 sample 算子采样出一份样本来，然后统计一下每个 key 的数量，计算出来数据量最大的是哪几个 key。具体方法上面已经介绍过了，这里不赘述。</li>
<li>然后将这 k 个 key 对应的数据从 leftRDD 中单独过滤出来，并给每个 key 都打上 1~n 以内的随机数作为前缀，形成一个单独的 leftSkewRDD；而不会导致倾斜的大部分 key 形成另外一个 leftUnSkewRDD。</li>
<li>接着将需要 join 的另一个 rightRDD，也过滤出来那几个倾斜 key 并通过 flatMap 操作将该数据集中每条数据均转换为 n 条数据（这 n 条数据都按顺序附加一个 1~n 的前缀），形成单独的 rightSkewRDD；不会导致倾斜的大部分 key 也形成另外一个 rightUnSkewRDD。</li>
<li>现在将 leftSkewRDD 与 膨胀 n 倍的 rightSkewRDD 进行 join，且在 Join 过程中将随机前缀去掉，得到倾斜数据集的 Join 结果 skewedJoinRDD。注意到此时我们已经成功将原先相同的 key 打散成 n 份，分散到多个 task 中去进行 join 了。</li>
<li>对 leftUnSkewRDD 与 rightUnSkewRDD 进行Join，得到 Join 结果 unskewedJoinRDD。</li>
<li>通过 union 算子将 skewedJoinRDD 与 unskewedJoinRDD 进行合并，从而得到完整的 Join 结果集。</li>
</ol>
<div class="admonition note">
<p class="fa admonition-title">通知</p>
<ul>
    <li>rightRDD 与倾斜 Key 对应的部分数据，需要与随机前缀集 (1~n) 作笛卡尔乘积 (即将数据量扩大 n 倍），从而保证无论数据倾斜侧倾斜 Key 如何加前缀，都能与之正常 Join。</li>
    <li>skewRDD 的 join 并行度可以设置为 n * k (k 为 topSkewkey 的个数)。</li>
    <li>由于倾斜Key与非倾斜Key的操作完全独立，可并行进行。</li>
</ul>
</div>

<p><strong>a.适用场景</strong></p>
<p>两张表都比较大，无法使用 Map 端 Join。其中一个 RDD 有少数几个 Key 的数据量过大，另外一个 RDD 的 Key 分布较为均匀。</p>
<p><strong>b.解决方案</strong></p>
<p>将有数据倾斜的 RDD 中倾斜 Key 对应的数据集单独抽取出来加上随机前缀，另外一个 RDD 每条数据分别与随机前缀结合形成新的RDD（相当于将其数据增大到原来的N倍，N即为随机前缀的总个数），然后将二者Join并去掉前缀。然后将不包含倾斜Key的剩余数据进行Join。最后将两次Join的结果集通过union合并，即可得到全部Join结果。</p>
<p><strong>c.优势</strong></p>
<p>相对于 Map 端 Join，更能适应大数据集的 Join。如果资源充足，倾斜部分数据集与非倾斜部分数据集可并行进行，效率提升明显。且只针对倾斜部分的数据做数据扩展，增加的资源消耗有限。</p>
<p><strong>d.劣势</strong></p>
<p>如果倾斜 Key 非常多，则另一侧数据膨胀非常大，此方案不适用。而且此时对倾斜 Key 与非倾斜 Key 分开处理，需要扫描数据集两遍，增加了开销。</p>
<h3 id="6-7-思路6-大表key加盐小表扩大N倍jion"><a href="#6-7-思路6-大表key加盐小表扩大N倍jion" class="headerlink" title="6-7.思路6-大表key加盐小表扩大N倍join"></a>6-7.思路6-大表key加盐小表扩大N倍join</h3><p>如果出现数据倾斜的 Key 比较多，上一种方法将这些大量的倾斜 Key 分拆出来，意义不大。此时更适合直接对存在数据倾斜的数据集全部加上随机前缀，然后对另外一个不存在严重数据倾斜的数据集整体与随机前缀集作笛卡尔乘积（即将数据量扩大N倍）。</p>
<p>其实就是上一个方法的特例或者简化。少了拆分，也就没有 union。</p>
<p><strong>a.适用场景</strong></p>
<p>一个数据集存在的倾斜 Key 比较多，另外一个数据集数据分布比较均匀。</p>
<p><strong>b.优势</strong></p>
<p>对大部分场景都适用，效果不错。</p>
<p><strong>c.劣势</strong></p>
<p>需要将一个数据集整体扩大 N 倍，会增加资源消耗。</p>
<h3 id="6-8-思路7-map端先局部聚合"><a href="#6-8-思路7-map端先局部聚合" class="headerlink" title="6-8.思路7-map端先局部聚合"></a>6-8.思路7-map端先局部聚合</h3><p>在 map 端加个 combiner 函数进行局部聚合。加上 combiner 相当于提前进行 reduce ,就会把一个 mapper 中的相同 key 进行聚合，减少 shuffle 过程中数据量 以及 reduce 端的计算量。</p>
<p>这种方法可以有效的缓解数据倾斜问题，但是如果导致数据倾斜的 key 大量分布在不同的 mapper 的时候，这种方法就不是很有效了。</p>
<blockquote>
<p>使用 reduceByKey 而不是 groupByKey。</p>
</blockquote>
<h3 id="6-9-思路8-加盐局部聚合-去盐全局聚合"><a href="#6-9-思路8-加盐局部聚合-去盐全局聚合" class="headerlink" title="6-9.思路8-加盐局部聚合+去盐全局聚合"></a>6-9.思路8-加盐局部聚合+去盐全局聚合</h3><p>这个方案的核心实现思路就是进行两阶段聚合。</p>
<ul>
<li>第一次是局部聚合，先给每个 key 都打上一个 1~n 的随机数，<blockquote>
<p>比如 3 以内的随机数，此时原先一样的 key 就变成不一样的了，比如 (hello, 1) (hello, 1) (hello, 1) (hello, 1) (hello, 1)，就会变成 (1_hello, 1) (3_hello, 1) (2_hello, 1) (1_hello, 1) (2_hello, 1)。</p>
</blockquote>
</li>
<li>接着对打上随机数后的数据，执行 reduceByKey 等聚合操作，进行局部聚合.<blockquote>
<p>那么局部聚合结果，就会变成了 (1_hello, 2) (2_hello, 2) (3_hello, 1)。然后将各个 key 的前缀给去掉，就会变成 (hello, 2) (hello, 2) (hello, 1)</p>
</blockquote>
</li>
<li>再次进行全局聚合操作,就可以得到最终结果。<blockquote>
<p>比如 (hello, 5)。</p>
</blockquote>
</li>
</ul>
<pre><code class="scala">/**
 * Two-stage aggregation that counts occurrences per key while avoiding skew.
 *
 * Stage 1 salts every record with its own random prefix in [0, 10) and
 * aggregates per salted key; stage 2 strips the prefix and aggregates again.
 * The prefix is drawn per record: a single prefix shared by all records
 * would leave the key distribution exactly as skewed as before.
 * One Random is created per partition rather than per record.
 * split(SPLIT, 2) only cuts at the first separator, so original keys that
 * themselves contain the separator are restored intact.
 */
def antiSkew(): RDD[(String, Int)] = {
    val SPLIT = &quot;-&quot;
    val salted = pairs.mapPartitions { iter =&gt;
        val random = new Random()
        iter.map(t =&gt; (random.nextInt(10).toString + SPLIT + t._1, 1))
    }
    salted
        .reduceByKey((v1, v2) =&gt; v1 + v2)
        .map(t =&gt; (t._1.split(SPLIT, 2)(1), t._2))
        .reduceByKey((v1, v2) =&gt; v1 + v2)
}</code></pre>
<p>不过进行两次 mapreduce，性能稍微比一次的差些。</p>
<h2 id="7-Hadoop中的数据倾斜"><a href="#7-Hadoop中的数据倾斜" class="headerlink" title="7.Hadoop中的数据倾斜"></a>7.Hadoop中的数据倾斜</h2><p>Hadoop 中直接贴近用户使用的是 Mapreduce 程序和 Hive 程序，虽说 Hive 最后也是用 MR 来执行（至少目前 Hive 内存计算并不普及），但是毕竟写的内容逻辑区别很大，一个是程序，一个是Sql，因此这里稍作区分。</p>
<p>Hadoop 中的数据倾斜主要表现在 reduce 阶段卡在99.99%，一直99.99%不能结束。</p>
<p>这里如果详细地查看日志或者监控界面的话会发现:</p>
<ul>
<li>有一个或几个 reduce 卡住</li>
<li>各种 container报错 OOM</li>
<li>读写的数据量极大，至少远远超过其它正常的 reduce</li>
<li>伴随着数据倾斜，会出现任务被 kill 等各种诡异的表现。</li>
</ul>
<p><strong>经验:</strong></p>
<p> Hive的数据倾斜，一般都发生在 Sql 中 Group 和 On 上，而且和数据逻辑绑定比较深。</p>
<p><strong>优化方法</strong></p>
<p>这里列出来一些方法和思路，具体的参数和用法在官网看就行了。</p>
<ol>
<li>map join 方式</li>
<li>count distinct 的操作，先转成 group，再 count</li>
<li>参数调优<blockquote>
<ul>
<li><code>set hive.map.aggr=true</code></li>
<li><code>set hive.groupby.skewindata=true</code></li>
</ul>
</blockquote>
</li>
<li>left semi join 的使用</li>
<li>设置 map 端输出、中间结果压缩。（不完全是解决数据倾斜的问题，但是减少了 IO 读写和网络传输，能提高很多效率）</li>
</ol>
<div class="admonition note">
<p class="fa admonition-title">说明</p>
<p>
    <code>hive.map.aggr=true:</code> 在map中会做部分聚集操作，效率更高但需要更多的内存。
</p>
<p>
    <code>hive.groupby.skewindata=true:</code>  数据倾斜时负载均衡，当选项设定为true，生成的查询计划会有两个MRJob。第一个MRJob 中，Map的输出结果集合会随机分布到Reduce中，每个Reduce做部分聚合操作，并输出结果，这样处理的结果是相同的GroupBy Key有可能被分发到不同的Reduce中，从而达到负载均衡的目的；第二个MRJob再根据预处理的数据结果按照GroupBy Key分布到Reduce中（这个过程可以保证相同的GroupBy Key被分布到同一个Reduce中），最后完成最终的聚合操作。
</p>
</div>

<h2 id="8-参考文章"><a href="#8-参考文章" class="headerlink" title="8.参考文章"></a>8.参考文章</h2><ul>
<li><a href="http://www.jasongj.com/spark/skew/" target="_blank" rel="noopener">Spark性能优化之道——解决Spark数据倾斜（Data Skew）的N种姿势</a></li>
<li><a href="https://segmentfault.com/a/1190000009166436" target="_blank" rel="noopener">漫谈千亿级数据优化实践：数据倾斜（纯干货）</a></li>
<li><a href="https://www.jianshu.com/p/06b67a3c61a9" target="_blank" rel="noopener">解决spark中遇到的数据倾斜问题</a></li>
</ul>

      
       <hr><span style="font-style: italic;color: gray;"> 转载请注明来源，欢迎指出任何有错误或不够清晰的表达。可以在下面评论区评论，也可以邮件至 157162006@qq.com </span>
    </div>
</article>


<p>
    <a  class="dashang" onclick="dashangToggle()">赏</a>
</p>


<div class="article_copyright">
    <p><span class="copy-title">文章标题:</span>spark数据倾斜及解决方案</p>
    <p><span class="copy-title">字数:</span><span class="post-count">4.8k</span></p>
    <p><span class="copy-title">本文作者:</span><a  title="沐雨云楼">沐雨云楼</a></p>
    <p><span class="copy-title">发布时间:</span>2020-06-20, 21:41:12</p>
    <p><span class="copy-title">最后更新:</span>2020-09-12, 21:21:47</p>
    <span class="copy-title">原始链接:</span><a class="post-url" href="/blog/2020/06/20/spark-data-skew/" title="spark数据倾斜及解决方案">https://iworkh.gitee.io/blog/2020/06/20/spark-data-skew/</a>
    <p>
        <span class="copy-title">版权声明:</span><i class="fa fa-creative-commons"></i> <a rel="license noopener" href="https://creativecommons.org/licenses/by-nc-sa/4.0/" target="_blank" title="CC BY-NC-SA 4.0 International">&#34;署名-非商用-相同方式共享 4.0&#34;</a> 转载请保留原文链接及作者。
    </p>
</div>



    <div id="comments"></div>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/gitalk@1/dist/gitalk.css">

<script type="text/javascript">
    // Load the Gitalk bundle on demand; once it is available, mount the
    // comment widget into the #comments container.
    // NOTE(review): the OAuth client secret below is shipped in page source;
    // confirm this exposure is acceptable for this OAuth app.
    $.getScript('/blog/js/gitalk.js', function () {
        var gitalkOptions = {
            clientID: 'e4a269e1ae1600031361',
            clientSecret: '6f3f981cbc49dd802fde779697d8f01da3a77981',
            repo: 'iworkh.github.io',
            owner: 'iworkh',
            admin: ['iworkh'],
            // One comment thread per page, keyed by the decoded URL path
            id: decodeURI(location.pathname),
            distractionFreeMode: 'true',
            language: 'zh-CN',
            perPage: parseInt('10',10)
        };
        var commentWidget = new Gitalk(gitalkOptions);
        commentWidget.render('comments')
    })
</script>




    




    </div>
    <div class="copyright">
        <p class="footer-entry">©2020 iworkh</p>
<!--<p class="footer-entry">Built with <a href="https://hexo.io/" target="_blank">Hexo</a> and <a href="https://github.com/yelog/hexo-theme-3-hexo" target="_blank">3-hexo</a> theme</p>-->

    </div>
    <div class="full-toc">
        <button class="full" data-title="切换全屏 快捷键 s"><span class="min "></span></button>
<button class="post-toc-menu" data-title="打开TOC 快捷键 w"><span class="post-toc-menu-icons"></span></button>
<div class="post-toc"><span class="post-toc-title">目录</span>
    <div class="post-toc-content">

    </div>
</div>
<a class="" id="rocket" ></a>

    </div>
</div>
<div class="acParent"></div>

<div class="hide_box" onclick="dashangToggle()"></div>
<div class="shang_box">
    <a class="shang_close"  onclick="dashangToggle()">×</a>
    <div class="shang_tit">
        <p>喜欢就点赞,疼爱就打赏</p>
    </div>
    <div class="shang_payimg">
        <div class="pay_img">
            <img src="/blog/img/alipay.jpg" class="alipay" title="扫码支持" alt="支付宝收款二维码">
            <img src="/blog/img/weixin.jpg" class="weixin" title="扫码支持" alt="微信收款二维码">
        </div>
    </div>
    <div class="shang_payselect">
        <span><label><input type="radio" name="pay" checked value="alipay">支付宝</label></span><span><label><input type="radio" name="pay" value="weixin">微信</label></span>
    </div>
</div>


</body>
<script src="/blog/js/jquery.pjax.js?v=1.0.1" ></script>

<script src="/blog/js/script.js?v=1.0.1" ></script>
<script>
    var img_resize = 'default';
    /* Author ("@...") and tag ("#...") suggestions for the search box */
    $(function () {
        var suggestions = ['@沐雨云楼','@iworkh','#ai','#attr','#aysnc','#beanmap','#bigdecimal','#blog','#chrome','#dict','#dll','#docker','#domain','#fastjson','#func','#git','#github','#hadoop','#hexo','#idea','#iworkh','#jackson','#java','#jdni','#jfinalshell','#joplin','#JPA','#lock','#markdowm','#maven','#mock','#mq','#mycat','#mysql','#nginx','#notebook','#plan','#powermock','#python','#redis','#rest','#sharding-jdbc','#spark','#springboot','#test','#tool','#tools','#tornado','#transactional','#web','#windows','#wtforms','#zookeeper','#分库分表','#分页','#命令','#工具','#并发','#序列化','#微服务','#性能分析','#手册','#有道云','#标签','#模板','#百度云盘','#结构和算法','#计划','#迁移','#面试',];
        var autoCompleteOptions = {
            'data': suggestions,
            'itemHeight': 20,
            'width': 418
        };
        /* Configure the plugin on the search input, then issue its 'show' command */
        var $searchBox = $('.search').AutoComplete(autoCompleteOptions);
        $searchBox.AutoComplete('show');
    })
    function initArticle() {
        /* Apply the green header theme to every table inside the article area */
        $(".post .pjax table").addClass("green_title");

        /* Reward dialog: show the QR code that matches the selected payment radio */
        $("input[name=pay]").on("click", function () {
            var weixinChosen = $("input[name=pay]:checked").val() == "weixin";
            $(".shang_box .shang_payimg .pay_img").toggleClass("weixin_img", weixinChosen);
        });

        /* Build a line-number gutter for each highlighted code block */
        $('pre code').each(function () {
            var $code = $(this);
            var lineCount = $code.text().trim().split('\n').length;
            /* Blocks with more than 99 lines get the wider gutter variant */
            var widthClass = lineCount > 99 ? 'widther' : '';
            var $gutter = $('<ul/>')
                .addClass('pre-numbering ' + widthClass)
                .attr("unselectable", "on");
            $code.addClass('has-numbering ' + widthClass)
                .parent()
                .append($gutter);
            var n = 1;
            while (n <= lineCount) {
                $gutter.append($('<li/>').text(n));
                n += 1;
            }
        });

        /* Visit counter: fetch the busuanzi script */
        $.getScript("//busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js");

        /* Give the gutter the same line height as the code so the numbers line up */
        var codeLineHeight = $('.has-numbering').css('line-height');
        $('.pre-numbering').css('line-height', codeLineHeight);
    }

    /*打赏页面隐藏与展示*/
    
    /* Fade the reward dialog and its backdrop in or out together */
    function dashangToggle() {
        var rewardLayers = [".shang_box", ".hide_box"];
        for (var i = 0; i < rewardLayers.length; i++) {
            $(rewardLayers[i]).fadeToggle();
        }
    }
    

</script>

<!--加入行号的高亮代码块样式-->

<style>
    /* Code block container; relative positioning anchors the absolutely positioned gutter */
    pre{
        position: relative;
        margin-bottom: 24px;
        border-radius: 10px;
        border: 1px solid #e2dede;
        background: #FFF;
        overflow: hidden;
    }
    /* Shift the code to the right to leave room for the line-number gutter */
    code.has-numbering{
        margin-left: 30px;
    }
    /* Larger offset used together with the wider gutter variant */
    code.has-numbering.widther{
        margin-left: 35px;
    }
    /* Line-number gutter pinned to the top-left corner of the pre element */
    .pre-numbering{
        margin: 0px;
        position: absolute;
        top: 0;
        left: 0;
        width: 20px;
        padding: 0.5em 3px 0.7em 5px;
        border-right: 1px solid #C3CCD0;
        text-align: right;
        color: #AAA;
        background-color: #fafafa;
    }
    /* Wider gutter variant */
    .pre-numbering.widther {
        width: 35px;
    }
</style>

<!--自定义样式设置-->
<style>
    
    
    /* Default .nav panel width; .fullscreen shifts it left by its full width
       (presumably moving it off-screen; confirm against the theme stylesheet) */
    .nav {
        width: 442px;
    }
    .nav.fullscreen {
        margin-left: -442px;
    }
    .nav-left {
        width: 120px;
    }
    
    
    /* Viewports up to 1468px wide: same .nav width, wider .nav-left column */
    @media screen and (max-width: 1468px) {
        .nav {
            width: 442px;
        }
        .nav.fullscreen {
            margin-left: -442px;
        }
        .nav-left {
            width: 150px;
        }
    }
    
    
    /* Viewports up to 1024px wide: the margins are inverted, so .nav is shifted
       left by default and .fullscreen resets the margin to 0 */
    @media screen and (max-width: 1024px) {
        .nav {
            width: 442px;
            margin-left: -442px
        }
        .nav.fullscreen {
            margin-left: 0;
        }
        .nav .hide-list.fullscreen {
            left: 442px
        }
    }
    
    /* Viewports up to 426px wide: .nav and .nav-left take the full width */
    @media screen and (max-width: 426px) {
        .nav {
            width: 100%;
        }
        .nav-left {
            width: 100%;
        }
    }
    
    
    /* Post title colour in the title list and in local search results */
    .nav-right .title-list nav a .post-title, .nav-right .title-list #local-search-result a .post-title {
        color: #383636;
    }
    
    
    /* Post date colour in the title list and in local search results */
    .nav-right .title-list nav a .post-date, .nav-right .title-list #local-search-result a .post-date {
        color: #5e5e5f;
    }
    
    
    /* Background of entries carrying the .hover class */
    .nav-right nav a.hover, #local-search-result a.hover{
        background-color: #e2e0e0;
    }
    
    

    /* List styles */
    
    /* Top-level lists in the article: rounded border with inner padding */
    .post .pjax article .article-entry>ol, .post .pjax article .article-entry>ul, .post .pjax article>ol, .post .pjax article>ul{
        border: #e2dede solid 1px;
        border-radius: 10px;
        padding: 10px 32px 10px 56px;
    }
    /* Lists nested inside a list item: small vertical padding only */
    .post .pjax article .article-entry li>ol, .post .pjax article .article-entry li>ul,.post .pjax article li>ol, .post .pjax article li>ul{
        padding-top: 5px;
        padding-bottom: 5px;
    }
    /* Items of top-level lists: bottom and left margins set to auto */
    .post .pjax article .article-entry>ol>li, .post .pjax article .article-entry>ul>li,.post .pjax article>ol>li, .post .pjax article>ul>li{
        margin-bottom: auto;
        margin-left: auto;
    }
    /* Items of nested lists: same margin settings */
    .post .pjax article .article-entry li>ol>li, .post .pjax article .article-entry li>ul>li,.post .pjax article li>ol>li, .post .pjax article li>ul>li{
        margin-bottom: auto;
        margin-left: auto;
    }
    

    /* 背景图样式 */
    
    


    /*引用块样式*/
    

    /*文章列表背景图*/
    

    
</style>






<div class="mobile-menus-out" >

</div>
<div class="mobile-menus">
    
    
    
    <a class="dynamic-menu " target="_blank"  href="https://pgmanor.gitee.io/blog/">pgmanor</a>
    
    
    <a class="dynamic-menu " target="_blank"  href="https://www.iworkh.com/manualIt/Category/scopeDev/">iworkh</a>
    
    
    <a class="dynamic-menu " target="_blank"  href="https://gitee.com/iworkh">gitee</a>
    
</div>


<div style="position:absolute; bottom: 0; right: 0;">
    <iframe frameborder="no" border="0" marginwidth="0" marginheight="0" width=150 height=40 title="网易云音乐播放器" src="//music.163.com/outchain/player?type=2&id=1323150930&auto=0&height=32"></iframe>
</div>

</html>
